In [ ]:
import os
from os import path

# Third-party
from astropy.table import Table
import astropy.units as u
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import h5py
import pandas as pd
import tqdm

from thejoker import JokerSamples

from twoface.config import TWOFACE_CACHE_PATH
from twoface.samples_analysis import unimodal_P
from twoface.db import (db_connect, AllStar, AllVisit, AllVisitToAllStar,
                        StarResult, Status, JokerRun, initialize_db)

In [ ]:
plot_path = '../../paper/1-catalog/figures/'
table_path = '../../paper/1-catalog/tables/'

In [ ]:
Session, _ = db_connect(path.join(TWOFACE_CACHE_PATH, 'apogee.sqlite'))
session = Session()

samples_file = path.join(TWOFACE_CACHE_PATH, 'apogee-jitter.hdf5')
control_samples_file = path.join(TWOFACE_CACHE_PATH, 'apogee-jitter-control.hdf5')

First, look at the control sample K percentiles


In [ ]:
with h5py.File(control_samples_file) as f:
    control_K = np.full((len(f.keys()), 256), np.nan)
    for i, key in enumerate(f):
        n_samples = len(f[key]['K'])
        control_K[i, :n_samples] = f[key]['K'][:]
        
ln_control_K = np.log(control_K)

In [ ]:
n_samples = np.sum(np.logical_not(np.isnan(control_K)), axis=1)
plt.hist(n_samples, bins=np.linspace(0, 256, 64));
plt.yscale('log')
plt.xlabel('$N$ samples returned')

How many are "needs mcmc" vs. "needs more prior samples":


In [ ]:
needs_mcmc = 0
needs_more_prior = 0
with h5py.File(control_samples_file) as f:
    keys = list(f.keys())
    for k in tqdm.tqdm(np.where(n_samples < 256)[0]):
        key = keys[k]
        data = AllStar.get_apogee_id(session, key).apogeervdata()
        samples = JokerSamples.from_hdf5(f[key])
        uni = unimodal_P(samples, data)
        
        if uni:
            needs_mcmc += 1
        else:
            needs_more_prior += 1

In [ ]:
needs_mcmc, needs_more_prior

Plot percentiles:


In [ ]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

ax = axes[0]
for perc in [1, 5, 15]:
    control_perc = np.nanpercentile(ln_control_K, perc, axis=1)
    ax.hist(control_perc, bins=np.linspace(-12, 10, 64), 
            alpha=0.5, label='{0} percentile'.format(perc));

ax = axes[1]
for perc in [85, 95, 99]:
    control_perc = np.nanpercentile(ln_control_K, perc, axis=1)
    ax.hist(control_perc, bins=np.linspace(-12, 10, 64), 
            alpha=0.5, label='{0} percentile'.format(perc));
    
for ax in axes:
    ax.legend(loc='best', fontsize=14)
    ax.set_xlabel(r'$\ln \left(\frac{K}{{\rm km}\,{s}^{-1}} \right)$')
    ax.set_yscale('log')
    
axes[0].set_title('control sample')
fig.tight_layout()

In [ ]:
# cut = -0.88 # 5% FPR
cut = -0.12 # 1% FPR
np.sum(np.nanpercentile(ln_control_K, 1., axis=1) > cut) / control_K.shape[0]

Summary: if we cut at $\ln K > -0.12$, 1% false-positive rate


Compute percentiles in lnK for all stars

Write a table with APOGEE_ID, percentile value:


In [ ]:
df = pd.read_hdf('../../cache/apogee-jitter-tbl.hdf5')
grouped = df.groupby('APOGEE_ID')
df.columns

In [ ]:
K_per = grouped.agg(lambda x: np.percentile(np.log(x['K']), 1))['K']
(K_per > cut).sum()

In [ ]:
high_K_tbl = Table()
high_K_tbl['APOGEE_ID'] = np.asarray(K_per.index).astype('U20')
high_K_tbl['lnK_per_1'] = np.asarray(K_per)
high_K_tbl.write(path.join(table_path, 'lnK-percentiles.fits'), overwrite=True)

In [ ]:
high_K_tbl[:8].write(path.join(table_path, 'lnK-percentiles.tex'), overwrite=True)

Now define the High-$K$ sample:


In [ ]:
high_K = np.asarray(K_per[K_per > cut].index).astype('U20')
len(high_K)

In [ ]:
for apogee_id in tqdm.tqdm(high_K):
    star = AllStar.get_apogee_id(session, apogee_id)
    res = star.results[0] # only one result...
    res.high_K = True
session.commit()

In [ ]:
_N = session.query(AllStar).join(StarResult).filter(StarResult.high_K).count()
print(_N)
assert _N == len(high_K)

In [ ]:
fig, ax = plt.subplots(1, 1, figsize=(6, 5))

control_perc = np.nanpercentile(ln_control_K, 1, axis=1)
bins = np.linspace(-12, 10, 64)
ax.hist(K_per, bins=bins, 
        alpha=1, label='APOGEE sample', normed=True,
        histtype='stepfilled', rasterized=True)

ax.hist(control_perc, bins=bins, 
        alpha=1, label='Control sample', normed=True, 
        histtype='step', linewidth=2, color='#333333')

ax.legend(loc='best', fontsize=13)

ax.set_yscale('log')
ax.set_xlabel(r'$\ln \left(\frac{K}{{\rm km}\,{s}^{-1}} \right)$')
ax.set_ylabel('density')

ax.axvline(cut, linestyle='--', zorder=10, alpha=1., color='tab:orange')

fig.tight_layout()
fig.savefig(path.join(plot_path, 'lnK-percentiles.pdf'), dpi=250)

In [ ]: